# This file replicates all analyses from the main paper.

#Please open the R-Project "Replication_STC.Rproj" first and load the "README.Rmd" to set the working directory.
getwd()

rm(list=ls())

library(tidyverse)
library(ggtext)
library(glue)
library(stargazer)


'%!in%' <- function(x,y)!('%in%'(x,y))
highlight = function(x, pat, color="black", family="") {
  ifelse(grepl(pat, x), glue("<b style='font-family:{family}; color:{color}'>{x}</b>"), x)
}

stc <- read_rds("stc.rds")

stc_raw <- readRDS(file = "not_shareable/stc_raw.rds")

# 1) Analyses for the main-paper ####

# 1.1) Figure 2: Data availability ####

stc %>%
  group_by(country_short, year) %>%
  summarise(distance_cntry = mean(distance, na.rm=T),
            estimated = mean(estimated, na.rm=T)) %>%
  #filter(year != 1999) %>%
  ggplot(., aes(x=year, y = forcats::fct_rev(country_short), fill = distance_cntry)) +
  geom_raster() + 
  #  geom_text(aes(label=round(estimated, 1)), colour = "white") + 
  viridis::scale_fill_viridis("Distance to last observed value") +
  ggthemes::theme_tufte() + 
  labs(x = "Year", y = "Country") + 
  theme(legend.position = "bottom") + 
  scale_x_continuous(breaks = seq(from = 2000, to = 2018, by = 2)) +  
  NULL

ggsave("output/Figure2.pdf",
       width = 22, height = 26, units = "cm")

# 1.2) Figure 3: Within-country range of subnational trade competitiveness ####

stc %>%
  filter(!grepl("-NAT", subnational_code), aggregation_level == 1) %>%
  group_by(country_short) %>%
  summarise(ra_stc_sym = max(stc_sym, na.rm=T) - min(stc_sym, na.rm=T),
            ra_stc_add = max(stc_add, na.rm=T) - min(stc_add, na.rm=T),
            ra_stc_net = max(stc_net, na.rm=T) - min(stc_net, na.rm=T),
            ra_stc_tba = max(stc_tba, na.rm=T) - min(stc_tba, na.rm=T),
            sum = ra_stc_sym + ra_stc_add + ra_stc_net + ra_stc_tba) %>%
  gather(ra_stc_sym:ra_stc_tba, key = "measure", value = "range") %>%
  dplyr::left_join(., stc %>%
                     filter(!grepl("-NAT", subnational_code), aggregation_level == 1) %>%
                     group_by(country_short) %>%
                     summarise(ra_stc_sym = max(stc_sym, na.rm=T) - min(stc_sym, na.rm=T),
                               ra_stc_add = max(stc_add, na.rm=T) - min(stc_add, na.rm=T),
                               ra_stc_net = max(stc_net, na.rm=T) - min(stc_net, na.rm=T),
                               ra_stc_tba = max(stc_tba, na.rm=T) - min(stc_tba, na.rm=T),
                               sum = ra_stc_sym + ra_stc_add + ra_stc_net + ra_stc_tba) %>%
                     gather(ra_stc_sym:ra_stc_tba, key = "measure", value = "range") %>%
                     group_by(measure) %>%
                     summarise(mean_measure = mean(range, na.rm=T))) %>%
  mutate(measure= factor(measure, 
                         levels = c("ra_stc_sym", "ra_stc_add",
                                    "ra_stc_net", "ra_stc_tba")),
         type = factor(ifelse(measure %in% c("ra_stc_sym", "ra_stc_add"), "Exports", "Trade~ Balance")),
         measure= factor(measure, 
                         levels = c("ra_stc_sym", "ra_stc_add",
                                    "ra_stc_net", "ra_stc_tba"),
                         labels = c(bquote(STC~ (symmetric)), bquote(STC~ (additive)), bquote(STC~ (net)), bquote(STC~ (trade~ balance))))) %>%
  ggplot(., aes(x=reorder(country_short, sum), y = range))+ 
  geom_bar(stat = "identity", position = position_dodge(), alpha = .75, colour = "black", fill = "white") + 
  geom_hline(aes(yintercept = mean_measure), colour = "black", lty = "dashed", size = 1.25, show.legend = F) +
  theme_minimal() + 
  labs(x = "Country", y = "Range of Subnational Trade Competitiveness") + 
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  #scale_fill_discrete("Measure:") + 
  #scale_colour_discrete("Measure:") + 
  ggh4x::facet_nested(type + measure ~., scales = "free_y", nest_line = T,
                      labeller = label_parsed) + 
  NULL


ggsave("output/Figure3.pdf",
       height = 17,
       width = 24, units = "cm")

# 1.3) Figure 4: Correlation between regional GNI per capita and subnational trade competitiveness ####

shdi_codes <- bind_rows(readxl::read_xlsx("./ancillary_data/shdi_subnational_codes.xlsx", sheet = "Level_1"),
                        readxl::read_xlsx("./ancillary_data/shdi_subnational_codes.xlsx", sheet = "Level_2")) %>%
  mutate(id = 1:nrow(.))

shdi_data <- read.csv("./not_shareable/SHDI Complete 4.0.csv") %>%
  #  #please download the SHDI data version 4 from this website. We cannot include it in the replication data. https://globaldatalab.org/shdi/download/
  filter(iso_code %in% stc$country_short)

shdi_data <- dplyr::left_join(shdi_codes, shdi_data, by = c(c("shdi_code" = "GDLCODE"), c("shdi_name" = "region"))) %>%
  dplyr::select(c("subnational_code", "year", "shdi", "gnic", "msch", "pop")) %>%
  group_by(subnational_code, year) %>%
  summarize(shdi = questionr::wtd.mean(x = shdi, weights = pop,  na.rm = T),
            gnic = questionr::wtd.mean(x = gnic, weights = pop,  na.rm = T),
            msch = questionr::wtd.mean(x = msch, weights = pop,  na.rm = T), 
            pop = sum(pop))
#We weight by population because several states may constitute one region. E.g. West US is then mostly California.

wb <- wbstats::wb(indicator = "NY.GNP.PCAP.PP.CD",
                  country = unique(stc$country_short), 
                  startdate = 1999, enddate = 2019) %>%
  dplyr::rename(country_short = iso3c,
                GNPpc = value, 
                year = date) %>%
  mutate(year = as.numeric(as.character(year))) %>%
  dplyr::select(country_short, year, GNPpc)

dat <- dplyr::left_join(stc, shdi_data, by = c("subnational_code", "year")) %>%
  dplyr::left_join(., wb, by = c("country_short", "year")) %>%
  filter(nchar(country_short) == 3) %>%
  mutate(GNPpc = GNPpc/10^3,
         lgnic = log(gnic))

df_cor <- data.frame(cntry = as.character(),
                     cor = as.numeric(),
                     measure = as.character())

cntry <- unique(dat$country_short)[c(-15, -57)] # remove CRI and TZA

for (i in cntry){
  
  temp_dat <- dat %>% filter(country_short == i & aggregation_level == 1)
  
  df_cor <- rbind(df_cor, 
                  data.frame(cntry = rep(i,4),
                             cor = c(cor(temp_dat$lgnic, temp_dat$stc_sym, use = "complete.obs"),
                                     cor(temp_dat$lgnic, temp_dat$stc_add, use = "complete.obs"),
                                     cor(temp_dat$lgnic, temp_dat$stc_net, use = "complete.obs"),
                                     cor(temp_dat$lgnic, temp_dat$stc_tba, use = "complete.obs")),
                             measure = factor(x = c("stc_sym", "stc_add","stc_net", "stc_tba"),
                                              levels = c("stc_sym", "stc_add",
                                                         "stc_net", "stc_tba"))))
  
}

dplyr::left_join(df_cor, df_cor %>%
                   group_by(cntry) %>%
                   summarise(sum_cor = sum(cor)),
                 "cntry") %>% 
  dplyr::left_join(., wb %>%
                     group_by(country_short) %>%
                     summarise(mean_gnp = mean(GNPpc, na.rm=T)),
                   by = c('cntry' = "country_short")) %>%
  mutate(type = factor(ifelse(measure %in% c("stc_sym", "stc_add"), "Exports", "Trade~ Balance")),
         measure= factor(measure, 
                         levels = c("stc_sym", "stc_add",
                                    "stc_net", "stc_tba"),
                         labels = c(bquote(STC~ (symmetric)), bquote(STC~ (additive)), bquote(STC~ (net)), bquote(STC~ (trade~ balance))))) %>%
  ggplot(., aes(x=reorder(cntry, mean_gnp), y = cor)) + 
  geom_hline(yintercept = 0) + 
  geom_bar(stat = "identity", alpha = .75, position = position_dodge(), fill = "white", colour = "black") + 
  geom_smooth(aes(group=1), method = "loess", se = F, colour = "black") + 
  theme_minimal() + 
  labs(x = "\nCountry", y = "Correlation between\nRegional GNIpc and Subnational Trade Competitiveness\n") + 
  scale_fill_discrete("Measure:") + 
  scale_colour_discrete("Measure:") + 
  ggh4x::facet_nested(type + measure ~., nest_line = T,
                      labeller = label_parsed) + 
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  NULL

ggsave(filename = "output/Figure4.pdf",
       height = 18,
       width = 24, units = "cm")

# 1.4) Figure 5: Subnational trade competitiveness of South Korean regions over time ####

df_line <- readxl::read_excel("./ancillary_data/Coding_schemes_changes.xlsx", sheet = "Change_1") %>%
  mutate(year = as.numeric(as.character(year)))

df_line_temp <- df_line %>% filter(country_short == "KOR")

    stc %>% 
      filter(country_short == "KOR" & aggregation_level == 1 & grepl("-NAT", subnational_code) == F) %>%
      dplyr::select(subnational_name, year, stc_sym, stc_add, stc_net, stc_tba) %>%
      gather(stc_sym:stc_tba, key = "measure", value = "comp") %>%
      mutate(measure= factor(measure, 
                             levels = c("stc_sym", "stc_add",
                                        "stc_net", "stc_tba")),
             type = factor(ifelse(measure %in% c("stc_sym", "stc_add"), "Exports", "Trade~ Balance")),
             measure= factor(measure, 
                             levels = c("stc_sym", "stc_add",
                                        "stc_net", "stc_tba"),
                             labels = c(bquote(STC~ (symmetric)), bquote(STC~ (additive)), bquote(STC~ (net)), bquote(STC~ (trade~ balance))))) %>%
      ggplot(., aes(x=year, y = comp, colour = subnational_name, shape = subnational_name)) +
      geom_point() + 
      geom_line() + 
      geom_hline(yintercept = 0, lty = "dotted") + 
      scale_shape_manual("",
                         values = c(0:as.numeric(stc %>%
                                                   filter(country_short == "KOR") %>%
                                                   group_by(subnational_name) %>%
                                                   n_groups())-1)) + 
      scale_colour_discrete("") + 
      ggh4x::facet_nested(type + measure ~., scales = "free_y", nest_line = T,
                          labeller = label_parsed) + 
      theme_minimal() + 
      labs(x = "Year", y = "Subnational Trade Competitiveness") + 
      scale_x_continuous(breaks = seq(from = 2000, to = 2020, by = 4), limits = c(1999, 2020)) + 
      theme(legend.position = "bottom") + 
      geom_vline(aes(xintercept = df_line_temp$year), lty = "dashed") +
      NULL
    
ggsave(filename =  "output/Figure5.pdf",
       height = 21,
       width = 19, units = "cm")

# 2) Appendix Section A: Time Trends ####

# 2.1) Figure A1: Distribution of time trends across sectors ####

library(funtimes)

tt <- left_join(stc %>%
                  select("country_short", "subnational_level", "subnational_code",
                         "year", "ISIC_coding", "estimated",  "distance",
                         "share_agri", "share_mini", "share_malt", "share_maht", "share_sr_t"),
                stc %>%
                  select("country_short", "subnational_level", "subnational_code",
                         "year", "ISIC_coding", "estimated",  "distance",
                         "share_agri", "share_mini", "share_malt", "share_maht", "share_sr_t") %>%
                  filter(estimated != 2) %>%
                  group_by(country_short, subnational_level, subnational_code) %>%
                  summarise(share_estimated = mean(estimated < 1, na.rm=T),
                            n = n())) %>%
  filter(estimated != 2 & n > 10) %>%
  pivot_longer(., cols = share_agri:share_sr_t, names_to = "sector", values_to = "share") 

tt <- left_join(tt, tt %>%
                  group_by(country_short, subnational_level, subnational_code, sector) %>%
                  summarise(variance_share = var(share, na.rm=T))) %>%
  filter(variance_share > 0) %>%
  drop_na()

df_test <-expand.grid(subnational_code = unique(tt$subnational_code),
                      sector = unique(tt$sector))

pvalue <- as.numeric()

pb = txtProgressBar(min = 1, max = nrow(df_test), initial = 0, style = 3) 

set.seed(12345)

for (i in 1:nrow(df_test)){
  
  dat_temp <- tt %>% filter(subnational_code == df_test$subnational_code[i],
                            sector == df_test$sector[i])
  
  if(nrow(dat_temp) > 0){
    pvalue <- c(pvalue, notrend_test(dat_temp$share, test = "MK", ar.method = "yw", factor.length = "adaptive.selection", j = c(1:15))$p.value)
  } else{pvalue <- c(pvalue, NA)}
  
  setTxtProgressBar(pb,i)
  
}

df_test$pvalue <- NA

df_test$pvalue <- pvalue

df_test$sector <- factor(df_test$sector,
                         levels = c("share_agri", "share_mini", "share_malt", "share_maht", "share_sr_t"),
                         labels = c("Agriculture", "Mining", "Manu. (low-tech)",  "Manu. (high-tech)", "Services"))



ggplot(df_test, aes(x=pvalue)) + 
  geom_histogram(fill = "white", colour = "black") + 
  facet_grid(sector~.) +
  theme_minimal() +
  labs(x = "p value", y = "Count")

ggsave(filename = "output/FigureA1.pdf",
       width = 15,
       height = 15,
       units = "cm")

round(prop.table(table(df_test$pvalue < 0.05)),4)
round(prop.table(table(df_test$sector, df_test$pvalue < 0.05), 1),4)

# 3) Appendix Section B: Available countries and years ####

# 3.1) Table B1: Summary of available countries ####

sources_temp <- readxl::read_excel("./ancillary_data/sources_temp.xlsx")
keys <- dplyr::left_join(readRDS("./ancillary_data/keys.rds"), sources_temp, by = "country_short")

print(xtable::xtable(keys %>%
                       mutate(country = countrycode::countrycode(country_short, origin = "iso3c", destination = "country.name"),
                              regions = paste(subnational_units, subnational_level),
                              survey_type = car::recode(survey_type, 
                                                        "'census' = 'Census'; 'household_survey' = 'Household'; 'labor_survey' = 'Labor'")) %>%
                       arrange(., country) %>%
                       dplyr::select(Country = country,
                                     Years = years,
                                     Regions = regions,
                                     'Coding scheme' = coding_scheme,
                                     'Survey type' = survey_type,
                                     Source = citation), 
                     digits = 0, 
                     label = "tab:summary_table",
                     caption = "Summary of available countries"), 
      include.rownames = F,
      type="latex", file= "output/TableB1.tex", label = "tab:summary_table", 
      # width = "\\pagewidth",
      sanitize.text.function = function(x) {x},
      tabular.environment = "longtable") 

# 3.2) Figure B1: Available countries ####

dat_map <- stc %>%
  filter(distance == 0) %>% 
  group_by(country_short) %>%
  summarise(Years = length(unique(year))) %>%
  mutate(country = countrycode::countrycode(country_short, origin = "iso3c", destination = "country.name"),
         region = car::recode(country, 
                              "'United Kingdom' = 'UK'; 'United States' = 'USA'; 
                              'Palestinian Territories' = 'Palestine'; 'Czechia' = 'Czech Republic'")) %>%
  left_join(map_data("world"), ., by = "region") 

unique(stc$country_short[stc$country_short %!in% dat_map$country_short]) # Missing Countries

ggplot(dat_map, aes(x = long, y = lat, group = group)) + 
  geom_polygon(aes(fill = Years), colour = "white", size = .01) + 
  scale_fill_gradient(low = "lightblue", high = "darkblue", na.value = "lightgray") +
  ggthemes::theme_tufte() +
  theme(axis.title = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        legend.position = "bottom") + 
  NULL

ggsave("output/FigureB1.pdf",
       height = 15,
       width = 30, units = "cm")

# 4) Appendix Section C: Regional gross value added ####

# 4.1) Figure C1: Comparison of labour surveys-based and GVA-based STC scores in the United Kingdom #####

ISIC_schemes <- list(ISIC3_names = readxl::read_excel("./ancillary_data/ISIC_schemes.xlsx", sheet = "ISIC3_names"),
                     ISIC31_names = readxl::read_excel("./ancillary_data/ISIC_schemes.xlsx", sheet = "ISIC31_names"),
                     ISIC4_names = readxl::read_excel("./ancillary_data/ISIC_schemes.xlsx", sheet = "ISIC4_names"))

# Sector limits
sectors_gbr <- readxl::read_excel("./ancillary_data/industry_groups.xlsx", na = "NA", sheet = "totl") %>%
  filter(ISIC_coding == "ISIC4_division") %>%
  mutate(ISIC4d_code = car::recode(ISIC_code, 
                                   "5:8 = '5-8'; 11:12 = '11-12'; 19:20 = '19-20'; 36:37 = '36-37'; 97:98 = '97-98'")) %>%
  dplyr::select(ISIC4d_code, sector) %>%
  distinct()

# Great Britain GVA data (in million pounds)
# for legal reasons, we cannot share the GVA data. Data is available here: https://www.ons.gov.uk/economy/grossvalueaddedgva/datasets/nominalandrealregionalgrossvalueaddedbalancedbyindustry
# We use the version from '11 June 2021 10:31' here.

gbr_gva <- readxl::read_excel("./not_shareable/GBR_regionalgrossvalueaddedbalancedbyindustryallitlregions-1.xlsx", sheet = "Table1b", range = "A2:Z1858") %>%
  rename_all(make.names) %>%
  filter(SIC07.code %in% c(1:99, "11-12", "19-20", "36-37", "5-8", "97-98"))%>%
  filter(nchar(ITL.region.code) == 3 & ITL.region.code %!in% c("TLB", "UKX", "TLZ")) %>%
  rename(subnational_code = ITL.region.code,
         subnational_name = ITL.region.name,
         ISIC4d_code = SIC07.code,
         ISIC4d_name = SIC07.description,
         X2019 = X2019...note.3.) %>%
  pivot_longer(names_to = "year", values_to = "gva", cols = X1998:X2019) %>%
  mutate(year = as.numeric(as.character(substr(year, 2, 5))),
         subnational_code = gsub(x = subnational_code, pattern = "TL", replacement = "GB-UK"))

# great britain labour data
gbr_lab <- readRDS(file = "./not_shareable/gbr_nuts_1.rds") %>% #again, data cannot be shared
  mutate(isic4d_code = as.numeric(as.character(substr(ISIC4_class, 1, nchar(ISIC4_class)-2))),
         isic3d_code = as.numeric(as.character(substr(ISIC3_class, 1, nchar(ISIC3_class)-2))),
         isic4d_code = car::recode(isic4d_code,
                                   "5:8 = '5-8'; 11:12 = 'm11-12'; 19:20 = '19-20'; 36:37 = '36-37'; 97:98 = '97-98'")) %>%
  filter(!is.na(isic4d_code)) %>% # before 2009, labour data is coded in isic 3. transformation into isic 4 division possible but maybe not worth it?
  group_by(subnational_code, subnational_name, year, isic4d_code) %>%
  summarise(count = sum(count, na.rm = t),
            weight = sum(weight, na.rm = t))

# RCA Data summarized on GBR specific division level
rtc_gbr <- readRDS(file = "./ancillary_data/rtc_gbr.rds") #this data is a subset of the stc_raw data, which is slightly aggregated and published in this data version.

gbr_gva_calc <- dplyr::left_join(gbr_gva, rtc_gbr %>% ungroup() %>% dplyr::select(year, ISIC_code, stc_sym, stc_add, stc_net, stc_tba), by = c("year", "ISIC4d_code" = "ISIC_code")) %>%
  mutate(stc_sym = ifelse(stc_sym %in% c(-Inf, Inf), NA, stc_sym),
         stc_add = ifelse(stc_add %in% c(-Inf, Inf), NA, stc_add),
         stc_net = ifelse(stc_net %in% c(-Inf, Inf), NA, stc_net),
         stc_tba = ifelse(stc_tba %in% c(-Inf, Inf), NA, stc_tba)) %>%
  dplyr::left_join(., sectors_gbr, by = "ISIC4d_code") 

gbr_lab_calc <- dplyr::left_join(gbr_lab, rtc_gbr %>% ungroup() %>% dplyr::select(year, ISIC_code, stc_sym, stc_add, stc_net, stc_tba), by = c("year", "isic4d_code" = "ISIC_code")) %>%
  mutate(stc_sym = ifelse(stc_sym %in% c(-Inf, Inf), NA, stc_sym),
         stc_add = ifelse(stc_add %in% c(-Inf, Inf), NA, stc_add),
         stc_net = ifelse(stc_net %in% c(-Inf, Inf), NA, stc_net),
         stc_tba = ifelse(stc_tba %in% c(-Inf, Inf), NA, stc_tba)) %>%
  dplyr::left_join(., sectors_gbr, by = c("isic4d_code" = "ISIC4d_code"))

# Aggregate to Sector Level

gbr_gva_country <- gbr_gva_calc  %>%
  group_by(year) %>%
  summarise(stc_sym_trad_nat = questionr::wtd.mean(x = stc_sym[sector %in% c("agri", "mini", "manu", "sr_t")], weights = gva[sector %in% c("agri", "mini", "manu", "sr_t")],  na.rm = T),
            stc_add_trad_nat = questionr::wtd.mean(x = stc_add[sector %in% c("agri", "mini", "manu", "sr_t")], weights = gva[sector %in% c("agri", "mini", "manu", "sr_t")],  na.rm = T),
            stc_net_trad_nat = questionr::wtd.mean(x = stc_net[sector %in% c("agri", "mini", "manu", "sr_t")], weights = gva[sector %in% c("agri", "mini", "manu", "sr_t")],  na.rm = T),
            stc_tba_trad_nat = questionr::wtd.mean(x = stc_tba[sector %in% c("agri", "mini", "manu", "sr_t")], weights = gva[sector %in% c("agri", "mini", "manu", "sr_t")],  na.rm = T))

gbr_gva_subnational <- gbr_gva_calc  %>%
  group_by(subnational_code, subnational_name, year) %>%
  summarise(stc_sym_trad_sub = questionr::wtd.mean(x = stc_sym[sector %in% c("agri", "mini", "manu", "sr_t")], weights = gva[sector %in% c("agri", "mini", "manu", "sr_t")],  na.rm = T),
            stc_add_trad_sub = questionr::wtd.mean(x = stc_add[sector %in% c("agri", "mini", "manu", "sr_t")], weights = gva[sector %in% c("agri", "mini", "manu", "sr_t")],  na.rm = T),
            stc_net_trad_sub = questionr::wtd.mean(x = stc_net[sector %in% c("agri", "mini", "manu", "sr_t")], weights = gva[sector %in% c("agri", "mini", "manu", "sr_t")],  na.rm = T),
            stc_tba_trad_sub = questionr::wtd.mean(x = stc_tba[sector %in% c("agri", "mini", "manu", "sr_t")], weights = gva[sector %in% c("agri", "mini", "manu", "sr_t")],  na.rm = T))

gbr_gva_combined <- dplyr::left_join(gbr_gva_subnational, gbr_gva_country, by = "year") %>%
  mutate(stc_sym_gva = stc_sym_trad_sub - stc_sym_trad_nat, 
         stc_add_gva = stc_add_trad_sub - stc_add_trad_nat,
         stc_net_gva = stc_net_trad_sub - stc_net_trad_nat,
         stc_tba_gva = stc_tba_trad_sub - stc_tba_trad_nat
  ) %>%
  ungroup() %>%
  dplyr::select(subnational_code, year, contains("gva"))

gbr_lab_country <- gbr_lab_calc  %>%
  group_by(year) %>%
  summarise(stc_sym_trad_nat = questionr::wtd.mean(x = stc_sym[sector %in% c("agri", "mini", "manu", "sr_t")], weights = weight[sector %in% c("agri", "mini", "manu", "sr_t")],  na.rm = T),
            stc_add_trad_nat = questionr::wtd.mean(x = stc_add[sector %in% c("agri", "mini", "manu", "sr_t")], weights = weight[sector %in% c("agri", "mini", "manu", "sr_t")],  na.rm = T),
            stc_net_trad_nat = questionr::wtd.mean(x = stc_net[sector %in% c("agri", "mini", "manu", "sr_t")], weights = weight[sector %in% c("agri", "mini", "manu", "sr_t")],  na.rm = T),
            stc_tba_trad_nat = questionr::wtd.mean(x = stc_tba[sector %in% c("agri", "mini", "manu", "sr_t")], weights = weight[sector %in% c("agri", "mini", "manu", "sr_t")],  na.rm = T)
  )

gbr_lab_subnational <- gbr_lab_calc  %>%
  group_by(subnational_code, subnational_name, year) %>%
  summarise(stc_sym_trad_sub = questionr::wtd.mean(x = stc_sym[sector %in% c("agri", "mini", "manu", "sr_t")], weights = weight[sector %in% c("agri", "mini", "manu", "sr_t")],  na.rm = T),
            stc_add_trad_sub = questionr::wtd.mean(x = stc_add[sector %in% c("agri", "mini", "manu", "sr_t")], weights = weight[sector %in% c("agri", "mini", "manu", "sr_t")],  na.rm = T),
            stc_net_trad_sub = questionr::wtd.mean(x = stc_net[sector %in% c("agri", "mini", "manu", "sr_t")], weights = weight[sector %in% c("agri", "mini", "manu", "sr_t")],  na.rm = T),
            stc_tba_trad_sub = questionr::wtd.mean(x = stc_tba[sector %in% c("agri", "mini", "manu", "sr_t")], weights = weight[sector %in% c("agri", "mini", "manu", "sr_t")],  na.rm = T)
  )

gbr_lab_combined <- dplyr::left_join(gbr_lab_subnational, gbr_lab_country, by = "year") %>%
  mutate(stc_sym_lab = stc_sym_trad_sub - stc_sym_trad_nat, 
         stc_add_lab = stc_add_trad_sub - stc_add_trad_nat,
         stc_net_lab = stc_net_trad_sub - stc_net_trad_nat,
         stc_tba_lab = stc_tba_trad_sub - stc_tba_trad_nat
  ) %>%
  ungroup() %>%
  dplyr::select(subnational_code, subnational_name, year, contains("_lab"))

# Revealed trade competitiveness
gbr_comparison <- dplyr::left_join(gbr_lab_combined, gbr_gva_combined, by = c("subnational_code", "year"))

cor.test(gbr_comparison$stc_sym_lab, gbr_comparison$stc_sym_gva)
cor.test(gbr_comparison$stc_add_lab, gbr_comparison$stc_add_gva)
cor.test(gbr_comparison$stc_net_lab, gbr_comparison$stc_net_gva)
cor.test(gbr_comparison$stc_tba_lab, gbr_comparison$stc_tba_gva)

rbind(gbr_comparison %>%
        select(subnational_code, subnational_name, year, contains("sym")) %>%
        rename(lab = stc_sym_lab, 
               gva = stc_sym_gva) %>%
        mutate(measure = "sym"),
      gbr_comparison %>%
        select(subnational_code, subnational_name, year, contains("add")) %>%
        rename(lab = stc_add_lab, 
               gva = stc_add_gva) %>%
        mutate(measure = "add"),
      gbr_comparison %>%
        select(subnational_code, subnational_name, year, contains("net")) %>%
        rename(lab = stc_net_lab, 
               gva = stc_net_gva) %>%
        mutate(measure = "net"),
      gbr_comparison %>%
        select(subnational_code, subnational_name, year, contains("tba")) %>%
        rename(lab = stc_tba_lab, 
               gva = stc_tba_gva) %>%
        mutate(measure = "tba")) %>%
  mutate(measure = factor(measure, 
                          levels = c("sym", "add", "net", "tba"),
                          labels = c("STC (symmetric)", "STC (additive)",
                                     "STC (net)", "STC (trade balance)"))) %>%
  ggplot(., aes(x = lab, y = gva)) + 
  geom_hline(yintercept = 0, lty = "dotted") + 
  geom_vline(xintercept = 0, lty = "dotted") + 
  geom_point(alpha = .2) + 
  geom_smooth(method = "lm", colour = "black", se = F) + 
  facet_wrap(.~measure, scales = "free") +
  theme_minimal() + 
  labs(x = "Labour survey-based STC", y = "GVA-based STC")

ggsave(filename = "output/FigureC1.pdf",
       width = 15, height = 8, units = "cm")  

# 4.2) Figure C2: Comparison of labour surveys-based and GVA-based STC scores in Ecuador ####


ecu_correspondence <- list(readxl::read_excel("ancillary_data/ECU_gva_correspondence.xlsx", sheet = "ISIC31"),
                           readxl::read_excel("ancillary_data/ECU_gva_correspondence.xlsx", sheet = "ISIC4"))
names(ecu_correspondence) <- c("ISIC31", "ISIC4")

# VALOR AGREGADO BRUTO PROVINCIAL POR INDUSTRIA (Miles de d?lares)
# The data is available through: https://contenido.bce.fin.ec/documentos/Estadisticas/SectorReal/CuentasProvinciales/Indice.htm
ecu_gva <- bind_rows(readxl::read_excel("./not_shareable/ECU_CtasProv2007-2019_clean.xlsx", sheet = "2007", skip = 5) %>% mutate(year = 2007),
                     readxl::read_excel("./not_shareable/ECU_CtasProv2007-2019_clean.xlsx", sheet = "2008", skip = 5) %>% mutate(year = 2008),
                     readxl::read_excel("./not_shareable/ECU_CtasProv2007-2019_clean.xlsx", sheet = "2009", skip = 5) %>% mutate(year = 2009),
                     readxl::read_excel("./not_shareable/ECU_CtasProv2007-2019_clean.xlsx", sheet = "2010", skip = 5) %>% mutate(year = 2010),
                     readxl::read_excel("./not_shareable/ECU_CtasProv2007-2019_clean.xlsx", sheet = "2011", skip = 5) %>% mutate(year = 2011),
                     readxl::read_excel("./not_shareable/ECU_CtasProv2007-2019_clean.xlsx", sheet = "2012", skip = 5) %>% mutate(year = 2012),
                     readxl::read_excel("./not_shareable/ECU_CtasProv2007-2019_clean.xlsx", sheet = "2013", skip = 5) %>% mutate(year = 2013),
                     readxl::read_excel("./not_shareable/ECU_CtasProv2007-2019_clean.xlsx", sheet = "2014", skip = 5) %>% mutate(year = 2014),
                     readxl::read_excel("./not_shareable/ECU_CtasProv2007-2019_clean.xlsx", sheet = "2015", skip = 5) %>% mutate(year = 2015),
                     readxl::read_excel("./not_shareable/ECU_CtasProv2007-2019_clean.xlsx", sheet = "2016", skip = 5) %>% mutate(year = 2016),
                     readxl::read_excel("./not_shareable/ECU_CtasProv2007-2019_clean.xlsx", sheet = "2017", skip = 5) %>% mutate(year = 2017),
                     readxl::read_excel("./not_shareable/ECU_CtasProv2007-2019_clean.xlsx", sheet = "2018", skip = 5) %>% mutate(year = 2018),
                     readxl::read_excel("./not_shareable/ECU_CtasProv2007-2019_clean.xlsx", sheet = "2019", skip = 5) %>% mutate(year = 2019)) %>%
  filter(!is.na(...1)) %>%
  dplyr::select(-c("...1", "ECONOM\u00cdA TOTAL")) %>%
  rename(subnational_name = ...2) %>%
  pivot_longer(names_to = "ISIC_name", values_to = "gva", cols = -c(subnational_name, year)) %>%
  mutate(ISIC_name = car::recode(ISIC_name,
                                 "'Acuicultura y pesca de camar?n' = 'Pesca y acuicultura';
                                 'Pesca y acuicultura (excepto de camar?n)' = 'Pesca y acuicultura';
                                 'Procesamiento y conservaci?n de camar?n' = 'Elaboraci?n y conservaci?n de pescado, crust?ceos y molusc';
                                 'Procesamiento y conservaci?n de pescado y otros productos acu?ticos' = 'Elaboraci?n y conservaci?n de pescado, crust?ceos y molusc'"),
         subnational_name = paste0(substr(subnational_name, 1, 1), tolower(substr(subnational_name, 2, nchar(subnational_name)))),
         subnational_name = car::recode(subnational_name, "'El oro' = 'El Oro'; 'Los rios' = 'Los R?os'; 'Santo domingo' = 'Santo Domingo de los Ts?chilas';
                                        'Morona santiago' = 'Morona Santiago'; 'Santa elena' = 'Santa Elena'; 'Zamora chinchipe' = 'Zamora Chinchipe';
                                        'Bolivar' = 'Bol?var'; 'Galapagos' = 'Gal?pagos'; 'Manabi' = 'Manab?'; 'Sucumbios' = 'Sucumb?os'")) %>%
  group_by(subnational_name, year, ISIC_name) %>%
  summarize(gva = sum(gva, na.rm = T))

# Ecuador Labour data
ecu_lab <- readRDS(file = "./not_shareable/ecu_provinces.rds")
# please consult Table A1 in the Appendix on the survey we use. We cannot share it for legal reasons.

ecu_lab <- bind_rows(ecu_lab %>% filter(!is.na(ISIC31_class)) %>% dplyr::left_join(., ecu_correspondence[["ISIC31"]], by = "ISIC31_class"),
                     ecu_lab %>% filter(!is.na(ISIC4_class)) %>% dplyr::left_join(., ecu_correspondence[["ISIC4"]], by = "ISIC4_class")) %>%
  group_by(subnational_code, subnational_name, year, ISIC_name) %>%
  summarize(count = sum(count, na.rm = T),
            weight = sum(weight, na.rm = T))

# RCA Data summarized on ECU specific level
rtc_ecu <- readRDS(file = "./ancillary_data/rtc_ecu.rds")

# Ecuador calculations
sectors_ecu <- readxl::read_excel("./ancillary_data/ECU_gva_correspondence.xlsx", sheet = "sectors")

ecu_gva_calc <- dplyr::left_join(ecu_gva, rtc_ecu %>% ungroup() %>% dplyr::select(year, ISIC_name, contains("stc_")), by = c("year", "ISIC_name")) %>%
  mutate(stc_sym = ifelse(stc_sym %in% c(-Inf, Inf), NA, stc_sym),
         stc_add = ifelse(stc_add %in% c(-Inf, Inf), NA, stc_add),
         stc_net = ifelse(stc_net %in% c(-Inf, Inf), NA, stc_net),
         stc_tba = ifelse(stc_tba %in% c(-Inf, Inf), NA, stc_tba)) %>%
  dplyr::left_join(., sectors_ecu, by = "ISIC_name") 

ecu_lab_calc <- dplyr::left_join(ecu_lab, rtc_ecu %>%
                                   ungroup() %>%
                                   dplyr::select(year, ISIC_name, contains("stc_")), 
                                 by = c("year", "ISIC_name")) %>%
  mutate(stc_sym = ifelse(stc_sym %in% c(-Inf, Inf), NA, stc_sym),
         stc_add = ifelse(stc_add %in% c(-Inf, Inf), NA, stc_add),
         stc_net = ifelse(stc_net %in% c(-Inf, Inf), NA, stc_net),
         stc_tba = ifelse(stc_tba %in% c(-Inf, Inf), NA, stc_tba))  %>%
  dplyr::left_join(., sectors_ecu, by = "ISIC_name")

# Aggregate to Sector Level

ecu_gva_country <- ecu_gva_calc  %>%
  group_by(year) %>%
  summarise(stc_sym_trad_nat = questionr::wtd.mean(x = stc_sym[sector %in% c("agri", "mini", "manu", "sr_t")], weights = gva[sector %in% c("agri", "mini", "manu", "sr_t")],  na.rm = T),
            stc_add_trad_nat = questionr::wtd.mean(x = stc_add[sector %in% c("agri", "mini", "manu", "sr_t")], weights = gva[sector %in% c("agri", "mini", "manu", "sr_t")],  na.rm = T),
            stc_net_trad_nat = questionr::wtd.mean(x = stc_net[sector %in% c("agri", "mini", "manu", "sr_t")], weights = gva[sector %in% c("agri", "mini", "manu", "sr_t")],  na.rm = T),
            stc_tba_trad_nat = questionr::wtd.mean(x = stc_tba[sector %in% c("agri", "mini", "manu", "sr_t")], weights = gva[sector %in% c("agri", "mini", "manu", "sr_t")],  na.rm = T),
  )

ecu_gva_subnational <- ecu_gva_calc  %>%
  group_by(subnational_name, year) %>%
  summarise(stc_sym_trad_sub = questionr::wtd.mean(x = stc_sym[sector %in% c("agri", "mini", "manu", "sr_t")], weights = gva[sector %in% c("agri", "mini", "manu", "sr_t")],  na.rm = T),
            stc_add_trad_sub = questionr::wtd.mean(x = stc_add[sector %in% c("agri", "mini", "manu", "sr_t")], weights = gva[sector %in% c("agri", "mini", "manu", "sr_t")],  na.rm = T),
            stc_net_trad_sub = questionr::wtd.mean(x = stc_net[sector %in% c("agri", "mini", "manu", "sr_t")], weights = gva[sector %in% c("agri", "mini", "manu", "sr_t")],  na.rm = T),
            stc_tba_trad_sub = questionr::wtd.mean(x = stc_tba[sector %in% c("agri", "mini", "manu", "sr_t")], weights = gva[sector %in% c("agri", "mini", "manu", "sr_t")],  na.rm = T)
  )

ecu_gva_combined <- dplyr::left_join(ecu_gva_subnational, ecu_gva_country, by = "year") %>%
  mutate(stc_sym_gva = stc_sym_trad_sub - stc_sym_trad_nat, 
         stc_add_gva = stc_add_trad_sub - stc_add_trad_nat,
         stc_net_gva = stc_net_trad_sub - stc_net_trad_nat,
         stc_tba_gva = stc_tba_trad_sub - stc_tba_trad_nat
  ) %>%
  ungroup() %>%
  dplyr::select(subnational_name, year, contains("_gva"))

ecu_lab_country <- ecu_lab_calc  %>%
  group_by(year) %>%
  summarise(stc_sym_trad_nat = questionr::wtd.mean(x = stc_sym[sector %in% c("agri", "mini", "manu", "sr_t")], weights = weight[sector %in% c("agri", "mini", "manu", "sr_t")],  na.rm = T),
            stc_add_trad_nat = questionr::wtd.mean(x = stc_add[sector %in% c("agri", "mini", "manu", "sr_t")], weights = weight[sector %in% c("agri", "mini", "manu", "sr_t")],  na.rm = T),
            stc_net_trad_nat = questionr::wtd.mean(x = stc_net[sector %in% c("agri", "mini", "manu", "sr_t")], weights = weight[sector %in% c("agri", "mini", "manu", "sr_t")],  na.rm = T),
            stc_tba_trad_nat = questionr::wtd.mean(x = stc_tba[sector %in% c("agri", "mini", "manu", "sr_t")], weights = weight[sector %in% c("agri", "mini", "manu", "sr_t")],  na.rm = T)
  )

ecu_lab_subnational <- ecu_lab_calc  %>%
  group_by(subnational_code, subnational_name, year) %>%
  summarise(stc_sym_trad_sub = questionr::wtd.mean(x = stc_sym[sector %in% c("agri", "mini", "manu", "sr_t")], weights = weight[sector %in% c("agri", "mini", "manu", "sr_t")],  na.rm = T),
            stc_add_trad_sub = questionr::wtd.mean(x = stc_add[sector %in% c("agri", "mini", "manu", "sr_t")], weights = weight[sector %in% c("agri", "mini", "manu", "sr_t")],  na.rm = T),
            stc_net_trad_sub = questionr::wtd.mean(x = stc_net[sector %in% c("agri", "mini", "manu", "sr_t")], weights = weight[sector %in% c("agri", "mini", "manu", "sr_t")],  na.rm = T),
            stc_tba_trad_sub = questionr::wtd.mean(x = stc_tba[sector %in% c("agri", "mini", "manu", "sr_t")], weights = weight[sector %in% c("agri", "mini", "manu", "sr_t")],  na.rm = T)
  )

ecu_lab_combined <- dplyr::left_join(ecu_lab_subnational, ecu_lab_country, by = "year") %>%
  mutate(stc_sym_lab = stc_sym_trad_sub - stc_sym_trad_nat, 
         stc_add_lab = stc_add_trad_sub - stc_add_trad_nat,
         stc_net_lab = stc_net_trad_sub - stc_net_trad_nat,
         stc_tba_lab = stc_tba_trad_sub - stc_tba_trad_nat
  ) %>%
  ungroup() %>%
  dplyr::select(subnational_code, subnational_name, year, contains("lab"))

# Ecuador comparison

# Revealed trade competitiveness
ecu_comparison <- dplyr::left_join(ecu_lab_combined, ecu_gva_combined, by = c("subnational_name", "year"))

cor.test(ecu_comparison$stc_sym_lab, ecu_comparison$stc_sym_gva)
cor.test(ecu_comparison$stc_add_lab, ecu_comparison$stc_add_gva)
cor.test(ecu_comparison$stc_net_lab, ecu_comparison$stc_net_gva)
cor.test(ecu_comparison$stc_tba_lab, ecu_comparison$stc_tba_gva)

rbind(ecu_comparison %>%
        select(subnational_code, subnational_name, year, contains("sym")) %>%
        rename(lab = stc_sym_lab, 
               gva = stc_sym_gva) %>%
        mutate(measure = "sym"),
      ecu_comparison %>%
        select(subnational_code, subnational_name, year, contains("add")) %>%
        rename(lab = stc_add_lab, 
               gva = stc_add_gva) %>%
        mutate(measure = "add"),
      ecu_comparison %>%
        select(subnational_code, subnational_name, year, contains("net")) %>%
        rename(lab = stc_net_lab, 
               gva = stc_net_gva) %>%
        mutate(measure = "net"),
      ecu_comparison %>%
        select(subnational_code, subnational_name, year, contains("tba")) %>%
        rename(lab = stc_tba_lab, 
               gva = stc_tba_gva) %>%
        mutate(measure = "tba")) %>%
  mutate(measure = factor(measure, 
                          levels = c("sym", "add", "net", "tba"),
                          labels = c("STC (symmetric)", "STC (additive)",
                                     "STC (net)", "STC (trade balance)"))) %>%
  ggplot(., aes(x = lab, y = gva)) + 
  geom_hline(yintercept = 0, lty = "dotted") + 
  geom_vline(xintercept = 0, lty = "dotted") + 
  geom_point(alpha = .2) + 
  geom_smooth(method = "lm", colour = "black", se = F) + 
  facet_wrap(.~measure, scales = "free") +
  theme_minimal() + 
  labs(x = "Labour survey-based STC", y = "GVA-based STC")

ggsave(filename = "output/FigureC2.pdf",
       width = 15, height = 8, units = "cm")  

# 4.3) Figure C3: Comparison of labour surveys-based and GVA-based weights in the United Kingdom ####

# Weights
gbr_weights <- dplyr::left_join(gbr_lab_calc %>% dplyr::select(subnational_code, subnational_name, year, ISIC4d_code = isic4d_code, weight, sector), 
                                gbr_gva_calc %>% dplyr::select(subnational_code, year, ISIC4d_code, ISIC4d_name, gva), 
                                by = c("subnational_code", "year", "ISIC4d_code"))

gbr_weights <- dplyr::left_join(gbr_weights, 
                                gbr_weights %>% group_by(subnational_code, year) %>% summarize(weight_sum = sum(weight, na.rm = T), gva_sum = sum(gva, na.rm = T)),
                                by = c("subnational_code", "year")) %>%
  mutate(weight_share = weight / weight_sum,
         gva_share = gva / gva_sum,
         share_diff = weight_share - gva_share)

cor.test(gbr_weights$weight_share, gbr_weights$gva_share)

ggplot(gbr_weights, aes(x = weight_share, y = gva_share)) + 
  geom_hline(yintercept = 0, lty = "dotted") + 
  geom_vline(xintercept = 0, lty = "dotted") + 
  geom_point(alpha = .2) + 
  geom_smooth(method = "lm", colour = "black", se = F) + 
  theme_minimal() + 
  labs(x = "Labour survey-based weight", y = "GVA-based weight")

ggsave(filename = "output/FigureC3.pdf",
       width = 15, height = 8, units = "cm")  

# 4.4) Figure C4: Comparison of labour surveys-based and GVA-based weights in Ecuador ####

ecu_weights <- dplyr::left_join(ecu_lab_calc %>% dplyr::select(subnational_code, subnational_name, year, ISIC_name, weight, sector), 
                                ecu_gva_calc %>% dplyr::select(subnational_name, year, ISIC_name, gva), 
                                by = c("subnational_name", "year", "ISIC_name"))

ecu_weights <- dplyr::left_join(ecu_weights, 
                                ecu_weights %>% group_by(subnational_code, year) %>% summarize(weight_sum = sum(weight, na.rm = T), gva_sum = sum(gva, na.rm = T)),
                                by = c("subnational_code", "year")) %>%
  mutate(weight_share = weight / weight_sum,
         gva_share = gva / gva_sum,
         share_diff = weight_share - gva_share)

cor.test(ecu_weights$weight_share, ecu_weights$gva_share)

ggplot(ecu_weights, aes(x = weight_share, y = gva_share)) + 
  geom_hline(yintercept = 0, lty = "dotted") + 
  geom_vline(xintercept = 0, lty = "dotted") + 
  geom_point(alpha = .2) + 
  geom_smooth(method = "lm", colour = "black", se = F) + 
  theme_minimal() + 
  labs(x = "Labour survey-based weight", y = "GVA-based weight")

ggsave(filename = "output/FigureC4.pdf",
       width = 15, height = 8, units = "cm")  

# 5) Appendix Section D: Data quality checks ####

setwd("./not_shareable/surveys/")
outFiles <- list.files(pattern = "*.rds")
outFiles <- outFiles[-which(outFiles == "keys.rds")]
outList <- lapply(outFiles, readRDS)
setwd("./../..")
dat <- dplyr::bind_rows(outList) %>%
  filter(nchar(country_short) == 3)

# 5.1) Figure D1: Employment shares by national industry sector ####

ilo <- haven::read_sav("./not_shareable/ilostat.sav") %>%
  #we cannot share the data but it is available via: https://ilostat.ilo.org/topics/employment/#.
  #any dataset providing information by economic activity is usable. We used: "Employment by Sex and Economic Activity"
  dplyr::select(country_short = ref_area, 
                year = time,
                sector = classif1,
                obs_ilo = obs_value) %>%
  filter(sector %in% c("ECO_AGGREGATE_AGR", "ECO_AGGREGATE_MEL", "ECO_AGGREGATE_MAN", "ECO_AGGREGATE_CON", "ECO_AGGREGATE_MKT", "ECO_AGGREGATE_PUB")) %>%
  mutate(sector = car::recode(sector, "'ECO_AGGREGATE_AGR' = 'Agriculture'; 'ECO_AGGREGATE_MEL' = 'Mining_Utility'; 'ECO_AGGREGATE_MAN' = 'Manufacturing';
                              'ECO_AGGREGATE_CON' = 'Construction'; 'ECO_AGGREGATE_MKT' = 'Services'; 'ECO_AGGREGATE_PUB' = 'Public_Community'")) %>%
  group_by(country_short, year) %>%
  mutate(share_ilo = obs_ilo/sum(obs_ilo))

dat_shares <- dat  %>%
  mutate(year = as.character(year),
         sector = car::recode(ISIC3_class, "1:599 = 'Agriculture'; 1000:1499 = 'Mining_Utility'; 1500:3799 = 'Manufacturing'; 4000:4199 = 'Mining_Utility'; 4500:4599 = 'Construction'; 5000:7499 = 'Services'; 7500:9900 = 'Public_Community'"),
         sector = ifelse(is.na(ISIC3_group), sector, 
                         car::recode(ISIC3_group, "1:59 = 'Agriculture'; 100:149 = 'Mining_Utility'; 150:379 = 'Manufacturing'; 400:419 = 'Mining_Utility'; 450:459 = 'Construction'; 500:749 = 'Services'; 750:990 = 'Public_Community'")),
         sector = ifelse(is.na(ISIC31_class), sector, 
                         car::recode(ISIC31_class, "1:599 = 'Agriculture'; 1000:1499 = 'Mining_Utility'; 1500:3799 = 'Manufacturing'; 4000:4199 = 'Mining_Utility'; 4500:4599 = 'Construction'; 5000:7499 = 'Services'; 7500:9900 = 'Public_Community'")),
         sector = ifelse(is.na(ISIC31_group), sector, 
                         car::recode(ISIC31_group, "1:59 = 'Agriculture'; 100:149 = 'Mining_Utility'; 150:379 = 'Manufacturing'; 400:419 = 'Mining_Utility'; 450:459 = 'Construction'; 500:749 = 'Services'; 750:990 = 'Public_Community'")),
         sector = ifelse(is.na(ISIC4_class), sector, 
                         car::recode(ISIC4_class, "1:399 = 'Agriculture'; 500:999 = 'Mining_Utility'; 1000:3399 = 'Manufacturing'; 3500:3999 = 'Mining_Utility'; 4100:4399 = 'Construction'; 4500:8299 = 'Services'; 8400:9900 = 'Public_Community'")),
         sector = ifelse(is.na(ISIC4_group), sector, 
                         car::recode(ISIC4_group, "1:39 = 'Agriculture'; 50:99 = 'Mining_Utility'; 100:339 = 'Manufacturing'; 350:399 = 'Mining_Utility'; 410:439 = 'Construction'; 450:829 = 'Services'; 840:990 = 'Public_Community'"))) %>%
  group_by(country_short, year, sector) %>%
  summarize(obs_survey = sum(weight)) %>%
  drop_na() %>%
  mutate(share_survey = obs_survey/sum(obs_survey)) %>%
  dplyr::left_join(., ilo, by = c("country_short", "year", "sector")) %>%
  mutate(difference = share_survey - share_ilo)

# Big Problems
subset(dat_shares, difference > 0.05 | difference < -0.05)
# Small Problems
problems_shares <- subset(dat_shares, difference > 0.01 | difference < -0.01)

cor.test(dat_shares$share_survey, dat_shares$share_ilo)
hist(dat_shares$difference)
ggplot(dat_shares, aes(x = country_short, y = difference)) +
  geom_boxplot() + 
  geom_hline(yintercept = 0, lty = "dashed") +
  ggthemes::theme_tufte() +
  labs(x = "Country", y = "Difference between Survey Share and ILO Share (in percentage points)") 

dat_shares %>%
  mutate(difference = ifelse(difference > .2, .2,
                             ifelse(difference < -.2, -.2, difference)),
         difference = difference * 100) %>%
  ggplot(., aes(x = country_short, y = difference)) +
  geom_boxplot() + 
  geom_hline(yintercept = 0, lty = "dashed") +
  ggthemes::theme_tufte() +
  labs(x = "Country", y = "Difference between Survey Share and ILO Share (in percentage points)") +
  theme(axis.text.x = element_text(angle = 90, vjust=0.5))

ggsave(filename = "output/FigureD1.pdf",
       height = 15,
       width = 30, units = "cm")

# 5.2) Figure D2: Population shares by region ####

shdi <- read.csv("./not_shareable/SHDI Complete 4.0.csv") %>%
  #  #please download the SHDI data version 4 from this website. We cannot include it in the replication data. https://globaldatalab.org/shdi/download/
  filter(level != "National") %>%
  mutate(pop_shdi = pop * 1e6)

shdi_national <- shdi %>%
  group_by(country, year) %>%
  summarize(pop_national = sum(pop_shdi))

shdi <- dplyr::left_join(shdi, shdi_national, by = c("country", "year")) %>%
  mutate(pop_share_shdi = pop_shdi / pop_national)

dat_pop <- dat %>%
  dplyr::left_join(., readxl::read_xlsx("./ancillary_data/shdi_subnational_codes.xlsx", sheet = "Level_1"), by = c("country_short", "subnational_code")) %>%
  group_by(country_short, shdi_code, shdi_name, year) %>%
  summarise(pop_survey = sum(weight))

dat_national <- dat_pop %>%
  group_by(country_short, year) %>%
  summarize(pop_national = sum(pop_survey))

dat_pop <- dplyr::left_join(dat_pop, dat_national, by = c("country_short", "year")) %>%
  mutate(pop_share_survey = pop_survey / pop_national) %>%
  dplyr::left_join(., shdi[,c("year", "GDLCODE", "pop_shdi", "pop_share_shdi")], by = c("year", c("shdi_code" = "GDLCODE"))) %>%
  mutate(difference = (pop_share_survey - pop_share_shdi) * 100)

cor.test(dat_pop$pop_share_shdi, dat_pop$pop_share_survey)
hist(dat_pop$difference)
ggplot(dat_pop, aes(x = country_short, y = difference)) +
  geom_boxplot() + 
  geom_hline(yintercept = 0, lty = "dashed") +
  ggthemes::theme_tufte() +
  labs(x = "Country", y = "Difference between Survey Population and SHDI Population (in percentage points)") +
  theme(axis.text.x = element_text(angle = 90, vjust=0.5))

ggsave(filename = "output/FigureD2.pdf",
       height = 15,
       width = 30, units = "cm")

# 5.3) Figure D3: Share of incorrect industry codes ####

readRDS("./ancillary_data/keys.rds") %>% 
  ggplot(., aes(x = country_short, y = check_missing)) +
  geom_bar(stat = "identity") +
  ggthemes::theme_tufte() +
  labs(x = "Country", y = "Share of unknown ISIC codes") +
  theme(axis.text.x = element_text(angle = 90, vjust=0.5))

ggsave(filename = "output/FigureD3.pdf",
       height = 15,
       width = 30, units = "cm")

# 5.4) Figuer D4: Ratio of duplicated respondents ####

readRDS("./ancillary_data/keys.rds") %>% 
  ggplot(., aes(x = country_short, y = check_duplicates)) +
  geom_bar(stat = "identity") +
  ggthemes::theme_tufte() +
  labs(x = "Country", y = "Ratio of duplicated respondents") +
  theme(axis.text.x = element_text(angle = 90, vjust=0.5))

ggsave(filename = "output/FigureD4.pdf",
       height = 15,
       width = 30, units = "cm")

# 5.5) Figure D5: Share of regions with few workers ####

low_number <- dat %>%
  gather(-c(subnational_code, count, weight, country_short, year), key = "ISIC_coding", value = "ISIC_code") %>%
  dplyr::filter(!is.na(ISIC_code)) %>%
  group_by(subnational_code, country_short, year) %>%
  summarize(number = sum(count)) %>%
  mutate(threshold = ifelse(number < 10, "Less than 10", 
                            ifelse(number < 100, "Less than 100", 
                                   ifelse(number < 1000, "Less than 1000", NA))))

table(low_number$country_short, low_number$threshold)

low_number %>%
  group_by(country_short) %>%
  summarise(share_50 = (sum(number < 50, na.rm = T)/n()) * 100) %>%
  ggplot(., aes(x = country_short, y = share_50)) +
  geom_bar(stat = "identity") +
  labs(x = "Country", y = "Districts with less than 50 workers (in percent)") +
  ggthemes::theme_tufte() +
  theme(axis.text.x = element_text(angle = 90, vjust=0.5))

ggsave(filename = "output/FigureD5.pdf",
       height = 15,
       width = 30, units = "cm")

# 6) Appendix Section E: Illustration using Austria ####

# 6.1) Table E1: Trade data and measures for Austria ####

cor_services <- list(readxl::read_xlsx("./ancillary_data/ISIC3-EBOPS_2002.xlsx", sheet = "ISIC3-class", na = "NA"),
                     readxl::read_xlsx("./ancillary_data/ISIC3-EBOPS_2002.xlsx", sheet = "ISIC3-group", na = "NA"),
                     readxl::read_xlsx("./ancillary_data/ISIC3-EBOPS_2002.xlsx", sheet = "ISIC3-division", na = "NA"),
                     readxl::read_xlsx("./ancillary_data/ISIC31-EBOPS_2002.xlsx", sheet = "ISIC31-class", na = "NA"),
                     readxl::read_xlsx("./ancillary_data/ISIC31-EBOPS_2002.xlsx", sheet = "ISIC31-group", na = "NA"),
                     readxl::read_xlsx("./ancillary_data/ISIC31-EBOPS_2002.xlsx", sheet = "ISIC31-division", na = "NA"),
                     readxl::read_xlsx("./ancillary_data/ISIC4-EBOPS_2002.xlsx", sheet = "ISIC4-class", na = "NA"),
                     readxl::read_xlsx("./ancillary_data/ISIC4-EBOPS_2002.xlsx", sheet = "ISIC4-group", na = "NA"),
                     readxl::read_xlsx("./ancillary_data/ISIC4-EBOPS_2002.xlsx", sheet = "ISIC4-division", na = "NA"))
names(cor_services) <- c("ISIC3c", "ISIC3g", "ISIC3d", "ISIC31c", "ISIC31g", "ISIC31d", "ISIC4c", "ISIC4g", "ISIC4d")

# correspondence tables for goods
load("./ancillary_data/correspondence_ISIC_to_HS.rds")

cor_goods_hs6 <- corr
rm(corr)

cor <- list()
for (i in 1:length(cor_goods_hs6)) {
  cor[[i]] <- cor_goods_hs6[[i]] %>%
    filter(ISIC_code <= c(4100, 410, 41, 4100, 410, 41, 3900, 390, 39)[i]) %>% # exclude services sectors
    bind_rows(., cor_services[[i]] %>%
                mutate(ISIC_code = ISIC_code) %>%
                filter(ISIC_code > c(4100, 410, 41, 4100, 410, 41, 3900, 390, 39)[i]) # exclude non-services sectors
    ) %>% 
    mutate(cor_code = ifelse(is.na(hs17), as.character(EBOPS_code), as.character(hs17))) %>%
    dplyr::select("ISIC_code", "cor_code")
}
names(cor) <- c("ISIC3c", "ISIC3g", "ISIC3d", "ISIC31c", "ISIC31g", "ISIC31d", "ISIC4c", "ISIC4g", "ISIC4d")

# cor <- cor[c("ISIC3g", "ISIC31g", "ISIC4g")] # since we only use group level data, we do not need the other correspondence tables

hs_codes_I3c <- unique(sort(cor[["ISIC3g"]]$cor_code))
hs_codes_I4c <- unique(sort(cor[["ISIC4g"]]$cor_code))

hs_codes_I4c[which(hs_codes_I4c %!in% hs_codes_I3c)]
#Note: ISIC 3c is more conservative, most differences are primary. Also for consistency reasons

trade_data <- readRDS("./not_shareable/trade_data.rds")
#for legal reasons, we cannot share the data. Thus, to obtain the data, scholars have to download trade data from Comtrade and Batis accordingly.
# The ancillary data "aut_example" contains the processed information for Austria.
# We left the code in the script but commented it out to make it understandable what we do.

# World Total Exports
world_total_ex <- trade_data %>%
  filter(flow == "Export", year == "2015") %>%
  group_by(year) %>%
  summarise(world_total_ex_hs = sum(value, na.rm=T))

# Country Total Exports
cntry_total_ex <- trade_data %>%
  filter(flow == "Export" & country_short == "AUT", year == "2015") %>%
  group_by(country_short, year) %>%
  summarise(cntry_total_ex_hs = sum(value, na.rm=T))

# World Total Import
world_total_im <- trade_data %>%
  filter(flow == "Import", year == "2015") %>%
  group_by(year) %>%
  summarise(world_total_im_hs = sum(value, na.rm=T))

# Country Total Import
cntry_total_im <- trade_data %>%
  filter(flow == "Import" & country_short == "AUT", year == "2015") %>%
  group_by(country_short, year) %>%
  summarise(cntry_total_im_hs = sum(value, na.rm=T))

# World Beverage Exports
world_beverage_ex <- dplyr::left_join(cor[["ISIC4g"]], trade_data, by = "cor_code") %>%
  filter(flow == "Export", country_short != "AUT", year == 2015, ISIC_code == 110) %>%
  group_by(ISIC_code, year) %>%
  summarise(value = sum(value, na.rm = T)) %>%
  rename(world_ISIC_ex_hs = "value")


# Country Beverage Exports
aut_beverages_ex <- dplyr::left_join(cor[["ISIC4g"]], trade_data, by = "cor_code") %>%
  filter(flow == "Export", country_short == "AUT", year == 2015, ISIC_code == 110) %>%
  group_by(ISIC_code, country_short, year) %>%
  summarise(cntry_ISIC_ex_hs = sum(value, na.rm=T))

# World Beverage Imports
world_beverage_im <- dplyr::left_join(cor[["ISIC4g"]], trade_data, by = "cor_code") %>%
  filter(flow == "Import", country_short != "AUT", year == 2015, ISIC_code == 110) %>%
  group_by(ISIC_code, year) %>%
  summarise(value = sum(value, na.rm = T)) %>%
  rename(world_ISIC_im_hs = "value")

# Country Beverage Imports
aut_beverages_im <- dplyr::left_join(cor[["ISIC4g"]], trade_data, by = "cor_code") %>%
  filter(flow == "Import", country_short == "AUT", year == 2015, ISIC_code == 110) %>%
  group_by(ISIC_code, country_short, year) %>%
  summarise(cntry_ISIC_im_hs = sum(value, na.rm=T))

aut_example <- aut_beverages_ex %>%
  dplyr::left_join(., aut_beverages_im, by = c("country_short", "year", "ISIC_code")) %>%
  dplyr::left_join(., cntry_total_ex, by = c("country_short", "year")) %>%
  dplyr::left_join(., cntry_total_im, by = c("country_short", "year")) %>%
  dplyr::left_join(., world_beverage_ex, by = c("ISIC_code", "year")) %>%
  dplyr::left_join(., world_beverage_im, by = c("ISIC_code", "year")) %>%
  dplyr::left_join(., world_total_ex, by = c("year")) %>%
  dplyr::left_join(., world_total_im, by = c("year")) %>%
  mutate(ex_share_country = cntry_ISIC_ex_hs / (cntry_total_ex_hs - cntry_ISIC_ex_hs),
         ex_share_world = world_ISIC_ex_hs / (world_total_ex_hs - world_ISIC_ex_hs),
         im_share_country = cntry_ISIC_im_hs / (cntry_total_im_hs - cntry_ISIC_im_hs),
         im_share_world = world_ISIC_im_hs / (world_total_im_hs - world_ISIC_im_hs),
         rxa_raw = ex_share_country/ex_share_world,
         rma_raw = im_share_country/im_share_world,
         stc_sym = (rxa_raw - 1) / (rxa_raw + 1),
         stc_add = (cntry_ISIC_ex_hs / cntry_total_ex_hs) - (world_ISIC_ex_hs / world_total_ex_hs),
         stc_net = (((rxa_raw -1 ) / (rxa_raw + 1)) - ((rma_raw - 1) / (rma_raw + 1))) / 2,
         stc_tba = (cntry_ISIC_ex_hs - cntry_ISIC_im_hs) / (cntry_ISIC_ex_hs + cntry_ISIC_im_hs))

print(xtable::xtable(aut_example %>%
                       ungroup() %>% 
                       select(cntry_ISIC_ex_hs:stc_tba) %>% 
                       pivot_longer(., cols = cntry_ISIC_ex_hs:stc_tba, names_to = "variable", values_to = "value") %>% 
                       mutate(value = ifelse(grepl(pattern = "_hs", variable), value / 10^6,
                                             ifelse(grepl(pattern = "_share", variable), value * 100, value)),
                              eq2 = c("X", "", "X", "","X", "", "X", "", "O", "O", rep("",8)),
                              eq3 = c(rep("", 12), "X", rep("", 5)),
                              eq4 = c("X", "", "X", "","X", "", "X", "", "O", "O", rep("",8)),
                              eq5 = c(rep("O", 12), rep("X",2), rep("",4)),
                              eq6 = c(rep("X",2), rep("", 16)))), include.rownames=FALSE)

# 6.2) Table E2: Labour Survey Data for ISIC Group 110 ####

aut_labour <- readRDS("./not_shareable/surveys/aut_states.rds") %>% 
  filter(year == "2015") %>% 
  mutate(ISIC4_group = substr(ISIC4_class, 1, nchar(ISIC4_class)-1)) %>% 
  filter(ISIC4_group == "110") %>% 
  group_by(subnational_code, subnational_name, year, ISIC4_group) %>% 
  summarise(count_beverages = sum(count, na.rm=T),
            weight_beverages = sum(weight, na.rm=T)) %>% 
  dplyr::left_join(., 
                   readRDS("./not_shareable/surveys/aut_states.rds") %>% 
                     filter(year == "2015") %>% 
                     mutate(ISIC4_group = substr(ISIC4_class, 1, nchar(ISIC4_class)-1)) %>% 
                     group_by(subnational_code, subnational_name, year) %>% 
                     summarise(count_total = sum(count, na.rm=T),
                               weight_total = sum(weight, na.rm=T)),
                   by = c("subnational_code", "subnational_name", "year")) %>% 
  mutate(count_share = count_beverages / count_total * 100,
         weight_share = weight_beverages / weight_total * 100)

print(xtable::xtable(aut_labour %>%
                       ungroup() %>% 
                       select(subnational_code, subnational_name, contains("share"))), include.rownames=FALSE)

# 7) Appendix Secion F: Additional Evidence ####

# 7.1) Figure F1: Correlation between the four measures ####

df_cor <- stc %>%
  ungroup() %>%
  select("stc_sym", "stc_add", "stc_net", "stc_tba") %>%
  drop_na()

data.frame(cor = c(cor(df_cor, use= "pairwise.complete.obs", method = "pearson")),
           x = rep(colnames(cor(df_cor, use= "pairwise.complete.obs", method = "pearson")),4),
           y = rep(rownames(cor(df_cor, use= "pairwise.complete.obs", method = "pearson")), each = 4)) %>%
  mutate(x = factor(x, 
                    levels = c("stc_sym", "stc_add",
                               "stc_net", "stc_tba")),
         y = factor(y, 
                    levels = c("stc_sym", "stc_add",
                               "stc_net", "stc_tba")),
         cor = ifelse(x == y, NA, cor),
         dot = ifelse(y == "stc_sym", cor,
                      ifelse(y == "stc_add" & x != "stc_sym", cor,
                             ifelse(y == "stc_net" & x == "stc_tba", cor, NA))),
         num = ifelse(x == "stc_sym", cor,
                      ifelse(x == "stc_add" & y != "stc_sym", cor,
                             ifelse(y == "stc_net" & x != "stc_tba", cor, 
                                    ifelse(y == "stc_tba", cor, NA)))),
         num = round(num, 2)) %>%
  ggplot(., aes(x=x, y=forcats::fct_rev(y))) + 
  geom_point(aes(size = dot, colour = dot)) +
  geom_text(aes(label = num), size = 8) +
  geom_abline(slope = -1, intercept = 5, size = 2) + 
  ggthemes::theme_tufte() +
  scale_size_continuous(range = c(10, 20)) +
  theme(legend.position = "none",
        text = element_text(size=16)) +
  labs(x = "", y = "") + 
  scale_x_discrete(labels = c(bquote(STC~ (symmetric)), bquote(STC~ (additive)), bquote(STC~ (net)), bquote(STC~ (trade~ balance)))) +
  scale_y_discrete(labels = c(bquote(STC~ (trade~ balance)), bquote(STC~ (net)), bquote(STC~ (additive)), bquote(STC~ (symmetric)))) +
  NULL

ggsave(filename = "output/FigureF1.pdf",
       height = 16,
       width = 22,
       units = "cm")

# 7.2) Figure F2: Correlation between the four measures before aggregation with labour surveys####

dat_cor <- stc_raw %>%
  ungroup() %>%
  filter(estimated == 0) %>% # # use only true data and not estimated data (which is nearly identical to the non-estimated years it is based upon)
  select(country_short, subnational_level, subnational_code, subnational_name, year, contains("rca")) %>%
  drop_na()

cor_weight <- dat_cor %>%
  group_by(country_short) %>%
  summarize(country_freq = n()) %>%
  mutate(country_weight = 1/country_freq) %>% # to ensure that every country weighs equally heavily in the calculation regardless of the number of subnational units it has
  dplyr::left_join(dat_cor, ., by = "country_short")

dat_cor <- dat_cor %>%
  dplyr::select(contains("rca"))

test <- cov.wt(x = dat_cor, wt = cor_weight$country_weight, cor = TRUE)

data.frame(cor = c(cor(dat_cor, use= "pairwise.complete.obs", method = "pearson")),
           x = rep(colnames(cor(dat_cor, use= "pairwise.complete.obs", method = "pearson")),4),
           y = rep(rownames(cor(dat_cor, use= "pairwise.complete.obs", method = "pearson")), each = 4)) %>%
  mutate(x = factor(x, 
                    levels = c("rca_sym", "rca_add",
                               "rca_net", "rca_tba")),
         y = factor(y, 
                    levels = c("rca_sym", "rca_add",
                               "rca_net", "rca_tba")),
         cor = ifelse(x == y, NA, cor),
         dot = ifelse(y == "rca_sym", cor,
                      ifelse(y == "rca_add" & x != "rca_sym", cor,
                             ifelse(y == "rca_net" & x == "rca_tba", cor, NA))),
         num = ifelse(x == "rca_sym", cor,
                      ifelse(x == "rca_add" & y != "rca_sym", cor,
                             ifelse(y == "rca_net" & x != "rca_tba", cor, 
                                    ifelse(y == "rca_tba", cor, NA)))),
         num = round(num, 2)) %>%
  ggplot(., aes(x=x, y=forcats::fct_rev(y))) + 
  geom_point(aes(size = dot, colour = dot)) +
  geom_text(aes(label = num), size = 8) +
  geom_abline(slope = -1, intercept = 5, size = 2) + 
  ggthemes::theme_tufte() +
  scale_size_continuous(range = c(10, 20)) +
  theme(legend.position = "none",
        text = element_text(size=16)) +
  labs(x = "", y = "") +
  scale_x_discrete(labels = c(bquote(RCA~ (symmetric)), bquote(RCA~ (additive)), bquote(RCA~ (net)), bquote(RCA~ (trade~ balance)))) +
  scale_y_discrete(labels = c(bquote(RCA~ (trade~ balance)), bquote(RCA~ (net)), bquote(RCA~ (additive)), bquote(RCA~ (symmetric)))) +
  NULL

ggsave("output/FigureF2.pdf",
       height = 16,
       width = 22,
       units = "cm")

# 7.3) Figure F3: Sectoral competitiveness by region in South Korea ####


dplyr::left_join(stc %>% filter(country_short == "KOR", year == 2018) %>%
                   ungroup() %>%
                   dplyr::select(subnational_code, subnational_name, contains("stc")) %>%
                   filter(subnational_code %in% c("KR-11", "KR-48", "KR-31", "KR-45")) %>%
                   gather(stc_sym:stc_tba_sr_t, value = "value", key = "competitiveness") %>% 
                   mutate(sector = factor(ifelse(grepl("malt", competitiveness), "Manu. (low-tech)",
                                                 ifelse(grepl("maht", competitiveness), "Manu. (high-tech)",
                                                        ifelse(grepl("manu", competitiveness), "Manufacturing",
                                                               ifelse(grepl("mi", competitiveness), "Mining",
                                                                      ifelse(grepl("ag", competitiveness), "Agriculture",
                                                                             ifelse(grepl("sr", competitiveness), "Services", "Overall")))))),
                                          levels = c("Overall", "Agriculture", "Mining",  "Manu. (low-tech)", "Manu. (high-tech)", "Services")),
                          measure = factor(substr(competitiveness, 5,7),
                                           levels = c("sym", "add", "net", "tba")),
                          dst_lab = paste0(subnational_name, "\n(", subnational_code, ")"),
                          type = factor(ifelse(measure %in% c("sym", "add"), "Exports", "Trade~ Balance")),
                          measure= factor(measure, 
                                          levels = c("sym", "add", "net", "tba"),
                                          labels = c(bquote(STC~ (symmetric)), bquote(STC~ (additive)), bquote(STC~ (net)[OV]), bquote(STC~ (trade~ balance))))),
                 stc %>% filter(country_short == "KOR", year == 2018) %>%
                   ungroup() %>%
                   dplyr::select(subnational_code, subnational_name, share_agri, share_mini, share_malt, share_maht, share_sr_t, share_trad) %>%
                   filter(subnational_code %in% c("KR-11", "KR-48", "KR-31", "KR-45")) %>%
                   gather(share_agri:share_trad, value = "value_share", key = "share") %>%
                   mutate(sector = factor(ifelse(grepl("trad", share), "Overall",
                                                 ifelse(grepl("malt", share), "Manu. (low-tech)",
                                                        ifelse(grepl("maht", share), "Manu. (high-tech)",
                                                               ifelse(grepl("manu", share), "Manufacturing",
                                                                      ifelse(grepl("mi", share), "Mining",
                                                                             ifelse(grepl("ag", share), "Agriculture",
                                                                                    ifelse(grepl("sr_t", share), "Services", NA))))))),
                                          levels = c("Overall", "Agriculture", "Mining", "Manu. (low-tech)", "Manu. (high-tech)", "Services"))),
                 by = c("subnational_code", "subnational_name", "sector")) %>% 
  mutate(dst_lab = gsub(" ", "~ ", dst_lab)) %>%
  filter(grepl("stc_tba", competitiveness) | grepl("stc_sym", competitiveness)) %>%
  ggplot(., aes(x = sector, y = value, size = value_share, colour = subnational_name)) + 
  geom_hline(yintercept = 0, lty = "dotted") + 
  geom_segment(aes(xend = sector, y = 0, yend = value), alpha = .2) +
  geom_point() + 
  ggh4x::facet_nested(type+measure~dst_lab, scale = "free_y", nest_line = T, labeller = label_parsed) + 
  labs(x = "\nSector", y = "Subnational Trade Competitiveness\n") + 
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + 
  scale_shape_manual("",
                     values = c(1, 4, 10)) +
  scale_colour_manual("", values = c("#00BFC4", "#00B4F0", "#C77CFF", "#FF64B0")) +
  scale_x_discrete(labels= function(x) highlight(x, "Overall", "black")) +
  theme(axis.text.x=element_markdown()) +
  theme(legend.position = "none") + 
  coord_cartesian(clip = "off") +
  NULL

ggsave("output/FigureF3.pdf",
       height = 20,
       width = 20, units = "cm")

# 7.4) Figure F4: Subnational trade competitiveness of Bolivian regions over time ####

df_line <- readxl::read_excel("./ancillary_data/Coding_schemes_changes.xlsx", sheet = "Change_1") %>%
  mutate(year = as.numeric(as.character(year)))

df_line_temp <- df_line %>% filter(country_short == "BOL")

stc %>% 
  filter(country_short == "BOL" & aggregation_level == 1 & grepl("-NAT", subnational_code) == F) %>%
  dplyr::select(subnational_name, year, stc_sym, stc_add, stc_net, stc_tba) %>%
  gather(stc_sym:stc_tba, key = "measure", value = "comp") %>%
  mutate(measure= factor(measure, 
                         levels = c("stc_sym", "stc_add",
                                    "stc_net", "stc_tba")),
         type = factor(ifelse(measure %in% c("stc_sym", "stc_add"), "Exports", "Trade~ Balance")),
         measure= factor(measure, 
                         levels = c("stc_sym", "stc_add",
                                    "stc_net", "stc_tba"),
                         labels = c(bquote(STC~ (symmetric)), bquote(STC~ (additive)), bquote(STC~ (net)), bquote(STC~ (trade~ balance))))) %>%
  ggplot(., aes(x=year, y = comp, colour = subnational_name, shape = subnational_name)) +
  geom_point() + 
  geom_line() + 
  geom_hline(yintercept = 0, lty = "dotted") + 
  scale_shape_manual("",
                     values = c(0:as.numeric(stc %>%
                                               filter(country_short == "BOL") %>%
                                               group_by(subnational_name) %>%
                                               n_groups())-1)) + 
  scale_colour_discrete("") + 
  ggh4x::facet_nested(type + measure ~., scales = "free_y", nest_line = T,
                      labeller = label_parsed) + 
  theme_minimal() + 
  labs(x = "Year", y = "Subnational Trade Competitiveness") + 
  scale_x_continuous(breaks = seq(from = 2000, to = 2020, by = 4), limits = c(1999, 2020)) + 
  theme(legend.position = "bottom") + 
  geom_vline(aes(xintercept = df_line_temp$year), lty = "dashed") +
  NULL

ggsave(filename =  "output/FigureF4.pdf",
       height = 21,
       width = 19, units = "cm")

# 7.5) Figure F5: Sectoral competitiveness by region in Bolivia ####

dplyr::left_join(stc %>% filter(country_short == "BOL", year == 2018) %>%
                   ungroup() %>%
                   dplyr::select(subnational_code, subnational_name, contains("stc")) %>%
                   filter(subnational_code %in% c("BO-C", "BO-P", "BO-S", "BO-L")) %>%
                   gather(stc_sym:stc_tba_sr_t, value = "value", key = "competitiveness") %>% 
                   mutate(sector = factor(ifelse(grepl("malt", competitiveness), "Manu. (low-tech)",
                                                 ifelse(grepl("maht", competitiveness), "Manu. (high-tech)",
                                                        ifelse(grepl("manu", competitiveness), "Manufacturing",
                                                               ifelse(grepl("mi", competitiveness), "Mining",
                                                                      ifelse(grepl("ag", competitiveness), "Agriculture",
                                                                             ifelse(grepl("sr", competitiveness), "Services", "Overall")))))),
                                          levels = c("Overall", "Agriculture", "Mining",  "Manu. (low-tech)", "Manu. (high-tech)", "Services")),
                          measure = factor(substr(competitiveness, 5,7),
                                           levels = c("sym", "add", "net", "tba")),
                          dst_lab = paste0(subnational_name, "\n(", subnational_code, ")"),
                          type = factor(ifelse(measure %in% c("sym", "add"), "Exports", "Trade~ Balance")),
                          measure= factor(measure, 
                                          levels = c("sym", "add", "net", "tba"),
                                          labels = c(bquote(STC~ (symmetric)), bquote(STC~ (additive)), bquote(STC~ (net)), bquote(STC~ (trade~ balance))))),
                 stc %>% filter(country_short == "BOL", year == 2018) %>%
                   ungroup() %>%
                   dplyr::select(subnational_code, subnational_name, share_trad, share_agri, share_mini, share_manu, share_malt, share_maht, share_sr_t) %>%
                   filter(subnational_code %in% c("BO-C", "BO-P", "BO-S", "BO-L")) %>%
                   gather(share_trad:share_sr_t, value = "value_share", key = "share") %>%
                   mutate(sector = factor(ifelse(grepl("trad", share), "Overall",
                                                 ifelse(grepl("malt", share), "Manu. (low-tech)",
                                                        ifelse(grepl("maht", share), "Manu. (high-tech)",
                                                               ifelse(grepl("manu", share), "Manufacturing",
                                                                      ifelse(grepl("mi", share), "Mining",
                                                                             ifelse(grepl("ag", share), "Agriculture",
                                                                                    ifelse(grepl("sr_t", share), "Services", NA))))))),
                                          levels = c("Overall", "Agriculture", "Mining", "Manu. (low-tech)", "Manu. (high-tech)", "Services"))),
                 by = c("subnational_code", "subnational_name", "sector")) %>% 
  mutate(dst_lab = gsub(" ", "~ ", dst_lab)) %>%
  ggplot(., aes(x = sector, y = value, size = value_share, colour = subnational_name)) + 
  geom_hline(yintercept = 0, lty = "dotted") + 
  geom_segment(aes(xend = sector, y = 0, yend = value), alpha = .2) +
  geom_point() + 
  ggh4x::facet_nested(type+measure~dst_lab, scale = "free_y", nest_line = T, labeller = label_parsed) + 
  labs(x = "\nSector", y = "Subnational Trade Competitiveness\n") + 
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + 
  scale_shape_manual("",
                     values = c(1, 4, 10)) +
  scale_colour_manual("", values = c("#D39200", "#00BA38", "#619CFF", "#DB72FB")) +
  scale_x_discrete(labels= function(x) highlight(x, "Overall", "black")) +
  theme(axis.text.x=element_markdown()) +
  theme(legend.position = "none") + 
  coord_cartesian(clip = "off") +
  NULL

ggsave("output/FigureF5.pdf",
       height = 20,
       width = 20, units = "cm")



# 7.6) Table F1: Summary statistics of all STC measures ####


stc1 <- as.data.frame(stc)

stc1 <- stc1[,c( "stc_sym", "stc_add", "stc_net", "stc_tba", "stc_sym_agri", 
                 "stc_add_agri", "stc_net_agri", "stc_tba_agri", "stc_sym_mini", 
                 "stc_add_mini", "stc_net_mini", "stc_tba_mini", "stc_sym_manu", 
                 "stc_add_manu", "stc_net_manu", "stc_tba_manu", "stc_sym_malt", 
                 "stc_add_malt", "stc_net_malt", "stc_tba_malt", "stc_sym_maht", 
                 "stc_add_maht", "stc_net_maht", "stc_tba_maht", "stc_sym_sr_t", 
                 "stc_add_sr_t", "stc_net_sr_t", "stc_tba_sr_t")]

stargazer::stargazer(stc1,
                     covariate.labels = c("STC (symmetric)", "STC (additive)", "STC (net)", "STC (trade balance)", 
                               "STC (symmetric agri)", "STC (additive agri)", "STC (net agri)", "STC (trade balance agri)",
                               "STC (symmetric mini)", "STC (additive mini)", "STC (net mini)", "STC (trade balance mini)",
                               "STC (symmetric manu)", "STC (additive manu)", "STC (net manu)", "STC (trade balance manu)",
                               "STC (symmetric manu lt)", "STC (additive manu lt)", "STC (net manu lt)", "STC (trade balance manu lt)",
                               "STC (symmetric manu ht)", "STC (additive manu ht)", "STC (net manu ht)", "STC (trade balance manu ht)",
                               "STC (symmetric serv)", "STC (additive serv)", "STC (net serv)", "STC (trade balance serv)"),
          digits=2, type = "latex", out = "output/TableF1.tex", title = "Summary statistics of all STC measures",
          label = "tab:desc_stat")


